
## Generates gender variables for inventors
## Zihao Li. 06/2024

# All gender variables are based on count=100, prob=0.9. For other specifications, change varnames accordingly.
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk
import re

# Root directory of the PatentsView data; the input ('cleandata/') and output
# ('temp/') paths used in main() are built by appending to this string.
# NOTE(review): this name shadows the built-in dir() for the whole module.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

def extract_suffix(column_name):
    """Return the trailing specification suffix of ``column_name``.

    The suffix is the part at the very end of the name that looks like
    ``NN_NN`` or ``NN_NNN`` (two digits, underscore, two or three digits),
    optionally preceded by ``io_``; e.g. ``'gender_io_09_100'`` gives
    ``'io_09_100'``.

    Returns:
        The matched suffix string, or ``None`` when the name does not end
        with such a pattern.
    """
    found = re.search(r'(io_)?\d{2}_\d{2,3}$', column_name)
    if found is None:
        return None
    return found.group(0)

def main():
    """Build patent-level inventor-gender variables and export them to CSV.

    Steps:
      1. Load the inventor-level gender file and collect, per ``patent_id``,
         the list of gender labels for each gender specification column.
      2. Load the patent-level file, left-merge the gender lists onto it and
         drop patents that received no gender list.
      3. For every specification suffix derive 0/1 team-composition
         indicators (``existfemale``, ``allfemale``, ``allmale``, ``mixed``,
         ``undetermined``), a categorical ``genderdiscrete`` variable and a
         ``leadfemale`` indicator.
      4. Drop patents whose id contains letters, convert ``patent_id`` to
         numeric, rename the list columns to ``*_list`` and write the result
         to ``temp/g_patent_clean_final_g.csv``.

    Raises:
        ValueError: If ``patent_id`` is not unique in either table, i.e. the
            merge would not be one-to-one.
    """
    ## Load gender data: g_inventor_gender_race_age.csv
    print('Loading gender data...')
    gender = parallel_read_csv(dir + 'cleandata/g_inventor_gender_race_age.csv', file_type='csv')

    # Gender columns to collect per patent (one per specification)
    columns_to_aggregate = [
        'gender_09_100', 'gender_09_50', 'gender_08_100', 'gender_08_50',
        'gender_io_09_100', 'gender_io_08_100', 'gender_io_07_100',
        'gender_io_06_100', 'gender_io_05_100',
    ]
    # Every column is aggregated into a plain Python list of labels per patent
    agg_dict = {col: lambda x: x.tolist() for col in columns_to_aggregate}

    # Keep only the relevant columns, then group, aggregate, reset index,
    # convert patent_id to string and sort
    print('Begin aggregation...')
    gender_df = (
        gender[['patent_id'] + columns_to_aggregate]
        .groupby('patent_id')
        .agg(agg_dict)
        .reset_index()
        .astype({'patent_id': str})
        .sort_values(by=['patent_id'])
    )
    # The inventor-level table is not used again; release it before the next load
    del gender
    print('Shape of gender dataset (grouped by patent_id) is {}.'.format(gender_df.shape)) # (8256193, 10)

    ## Load patent-level data: g_patent_clean.csv
    print('Loading patent-level data...')
    df = parallel_read_csv(dir + 'cleandata/g_patent_clean.csv')
    print('Shape of patent-level data is {}.'.format(df.shape)) # (8256143, 58)

    # gender_df['patent_id'] is already a string column (see aggregation above)
    df['patent_id'] = df['patent_id'].astype(str)

    ## Merge gender data with patent-level data
    print('Merging with patent-level data...')
    # Fail loudly here: continuing without the merge would only crash later
    # with a KeyError on the missing gender columns.
    if not (df['patent_id'].is_unique and gender_df['patent_id'].is_unique):
        raise ValueError('Merge is not one-to-one')
    df = df.merge(gender_df, how='left', on='patent_id')
    del gender_df
    # Patents without a match in the gender data have NaN instead of a list
    df = df.dropna(subset=['gender_09_100'])
    print('Shape of merged dataset (after dropping NAN) is {}.'.format(df.shape)) # (8256143, 67)

    ## Generate gender variables
    print('Generating gender variables...')
    suffixes = [extract_suffix(col) for col in columns_to_aggregate]

    # Indicator variables
    for suffix in suffixes:
        col_name = f'gender_{suffix}'
        print(f"Column: {col_name}, Suffix: {suffix}")
        gender_lists = df[col_name]
        # One membership pass per label over the per-patent lists; this avoids
        # a row-wise DataFrame.apply over the whole frame.
        has_female = pd.Series(['female' in g for g in gender_lists], index=df.index, dtype=bool)
        has_male = pd.Series(['male' in g for g in gender_lists], index=df.index, dtype=bool)
        has_ambiguous = pd.Series(['ambiguous' in g for g in gender_lists], index=df.index, dtype=bool)

        # Requiring the label itself to be present keeps a team with no
        # determined label (e.g. only missing values) from being flagged as
        # both all-female and all-male; such teams end up as undetermined.
        all_female = has_female & ~has_male & ~has_ambiguous
        all_male = has_male & ~has_female & ~has_ambiguous
        mixed = has_male & has_female
        undetermined = ~(all_female | all_male | mixed)

        df[f'existfemale_{suffix}'] = has_female.astype(int)
        df[f'allfemale_{suffix}'] = all_female.astype(int)
        df[f'allmale_{suffix}'] = all_male.astype(int)
        df[f'mixed_{suffix}'] = mixed.astype(int)
        df[f'undetermined_{suffix}'] = undetermined.astype(int)

    # Categorical genderdiscrete variable
    print('Gender discrete...')
    for suffix in suffixes:
        col_name = f'gender_{suffix}'
        print(f"Column: {col_name}, Suffix: {suffix}")
        for label in ('allfemale', 'allmale', 'mixed', 'undetermined'):
            df.loc[df[f'{label}_{suffix}'] == 1, f'genderdiscrete_{suffix}'] = label

    # Leading inventor is female (indicator)
    # NOTE(review): the lead_gender_* columns are not created in this script;
    # presumably they come from g_patent_clean.csv -- confirm.
    print('Lead gender...')
    for suffix in suffixes:
        df[f'leadfemale_{suffix}'] = (df[f'lead_gender_{suffix}'] == 'female').astype(int)

    # Drop patent_id with letters, and convert patent_id to numeric
    print('Dropping non-numeric patent_id and converting to numeric...')
    df = df[~df["patent_id"].str.contains("[a-zA-Z]")]
    df["patent_id"] = pd.to_numeric(df["patent_id"], errors="raise")
    print('Shape of final dataset is {}.'.format(df.shape)) # (7481690, 130)

    # Mark the raw per-patent label lists as such in the exported file
    rename_mapping = {col: f"{col}_list" for col in columns_to_aggregate}
    df.rename(columns=rename_mapping, inplace=True)

    print('Exporting the final dataset...')
    df.to_csv(dir + 'temp/g_patent_clean_final_g.csv', index=False)
    

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()